In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings("ignore")
In [2]:
automobile=pd.read_csv(r"E:\Ankit Jain\D drive\Aviraj Personal File\IMS Analytics Class\Github sets\Automobile\Automobile.csv")
automobile.sample(15).T
Out[2]:
156 191 171 19 185 39 40 98 157 107 170 55 17 194 87
symboling 0 -1 -1 0 3 0 0 0 0 0 -1 3 2 -2 1
normalized_losses 91 74 65 81 137 85 85 128 91 161 65 150 121 103 128
make toyota volvo toyota chevrolet volkswagen honda honda nissan toyota peugot toyota mazda chevrolet volvo nissan
fuel_type gas gas gas gas gas gas gas gas gas diesel diesel gas gas gas diesel
aspiration std std std std std std std std std turbo turbo std std turbo std
number_of_doors four four four four two four four four four four four two two four two
body_style sedan wagon hatchback sedan convertible sedan sedan sedan hatchback wagon sedan hatchback hatchback sedan sedan
drive_wheels fwd rwd fwd fwd fwd fwd fwd fwd fwd rwd fwd rwd fwd rwd fwd
engine_location front front front front front front front front front front front front front front front
wheel_base 95.7 104.3 102.4 94.5 94.5 96.5 96.5 100.4 95.7 114.2 102.4 95.3 88.4 104.3 94.5
length 166.3 188.8 175.6 158.8 159.3 175.4 175.4 181.7 166.3 198.9 175.6 169 141.1 188.8 165.3
width 64.4 67.2 66.5 63.6 64.2 62.5 65.2 66.5 64.4 68.4 66.5 65.7 60.3 67.2 63.8
height 53 57.5 53.9 52 55.6 54.1 54.1 55.1 52.8 58.7 54.9 49.6 53.2 56.2 54.5
curb_weight 2094 3034 2414 1909 2254 2372 2465 3095 2122 3430 2480 2500 1488 3045 2017
engine_type ohc ohc ohc ohc ohc ohc ohc ohcv ohc l ohc rotor l ohc ohc
number_of_cylinders four four four four four four four six four four four two three four four
engine_size 98 141 122 90 109 110 110 181 98 152 110 80 61 130 103
fuel_system 2bbl mpfi mpfi 2bbl mpfi 1bbl mpfi mpfi 2bbl idi idi mpfi 2bbl mpfi idi
bore 3.19 3.78 3.31 3.03 3.19 3.15 3.15 3.43 3.19 3.7 3.27 3.28 2.91 3.62 2.99
stroke 3.03 3.15 3.54 3.11 3.4 3.58 3.58 3.27 3.03 3.52 3.35 3.5 3.03 3.15 3.47
compression_ratio 9 9.5 8.7 9.6 8.5 9 9 9 9 21 22.5 9.4 9.5 7.5 21.9
horsepower 70 114 92 70 90 86 101 152 70 95 73 135 48 162 55
peak_rpm 4800 5400 4200 5400 5500 5800 5800 5200 4800 4150 4500 6000 5100 5100 4800
city_mpg 38 23 27 38 24 27 24 17 28 25 30 16 47 17 45
highway_mpg 47 28 32 43 29 33 28 22 34 25 33 23 53 22 50
price 7738 13415 9988 6575 11595 10295 12945 13499 8358 13860 10698 15645 5151 18420 7099
In [3]:
automobile.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 201 entries, 0 to 200
Data columns (total 26 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   symboling            201 non-null    int64  
 1   normalized_losses    201 non-null    int64  
 2   make                 201 non-null    object 
 3   fuel_type            201 non-null    object 
 4   aspiration           201 non-null    object 
 5   number_of_doors      201 non-null    object 
 6   body_style           201 non-null    object 
 7   drive_wheels         201 non-null    object 
 8   engine_location      201 non-null    object 
 9   wheel_base           201 non-null    float64
 10  length               201 non-null    float64
 11  width                201 non-null    float64
 12  height               201 non-null    float64
 13  curb_weight          201 non-null    int64  
 14  engine_type          201 non-null    object 
 15  number_of_cylinders  201 non-null    object 
 16  engine_size          201 non-null    int64  
 17  fuel_system          201 non-null    object 
 18  bore                 201 non-null    float64
 19  stroke               201 non-null    float64
 20  compression_ratio    201 non-null    float64
 21  horsepower           201 non-null    int64  
 22  peak_rpm             201 non-null    int64  
 23  city_mpg             201 non-null    int64  
 24  highway_mpg          201 non-null    int64  
 25  price                201 non-null    int64  
dtypes: float64(7), int64(9), object(10)
memory usage: 41.0+ KB
In [4]:
automobile.shape
Out[4]:
(201, 26)
In [5]:
automobile.describe().T
Out[5]:
count mean std min 25% 50% 75% max
symboling 201.0 0.840796 1.254802 -2.00 0.00 1.00 2.00 3.00
normalized_losses 201.0 125.189055 33.572966 65.00 101.00 122.00 150.00 256.00
wheel_base 201.0 98.797015 6.066366 86.60 94.50 97.00 102.40 120.90
length 201.0 174.200995 12.322175 141.10 166.80 173.20 183.50 208.10
width 201.0 65.889055 2.101471 60.30 64.10 65.50 66.60 72.00
height 201.0 53.766667 2.447822 47.80 52.00 54.10 55.50 59.80
curb_weight 201.0 2555.666667 517.296727 1488.00 2169.00 2414.00 2926.00 4066.00
engine_size 201.0 126.875622 41.546834 61.00 98.00 120.00 141.00 326.00
bore 201.0 3.329701 0.268166 2.54 3.15 3.31 3.58 3.94
stroke 201.0 3.261741 0.317875 2.07 3.11 3.29 3.46 4.17
compression_ratio 201.0 10.164279 4.004965 7.00 8.60 9.00 9.40 23.00
horsepower 201.0 103.263682 37.389372 48.00 70.00 95.00 116.00 262.00
peak_rpm 201.0 5121.393035 479.624905 4150.00 4800.00 5200.00 5500.00 6600.00
city_mpg 201.0 25.179104 6.423220 13.00 19.00 24.00 30.00 49.00
highway_mpg 201.0 30.686567 6.815150 16.00 25.00 30.00 34.00 54.00
price 201.0 13207.129353 7947.066342 5118.00 7775.00 10295.00 16500.00 45400.00
In [6]:
for column in automobile:
    print(automobile[column].unique())
    print()
[ 3  1  2  0 -1 -2]

[168 164 161 158 192 188 149 121  98  81 118 148 110 145 137 101  78 106
  85 107 115 104 113 150 129 119 105  93 122 142 140 153 139 125 128 103
 108 194 231 154  74 141 186  83 102  89  87  77  91 134  65 197  90 133
  94 256 132  95]

['alfa-romero' 'audi' 'bmw' 'chevrolet' 'dodge' 'honda' 'isuzu' 'jaguar'
 'mazda' 'mercedes-benz' 'mercury' 'mitsubishi' 'nissan' 'peugot'
 'plymouth' 'porsche' 'renault' 'saab' 'subaru' 'toyota' 'volkswagen'
 'volvo']

['gas' 'diesel']

['std' 'turbo']

['two' 'four']

['convertible' 'hatchback' 'sedan' 'wagon' 'hardtop']

['rwd' 'fwd' '4wd']

['front' 'rear']

[ 88.6  94.5  99.8  99.4 105.8 101.2 103.5 110.   88.4  93.7 103.3  95.9
  86.6  96.5  94.3  96.  113.  102.   93.1  95.3  98.8 104.9 106.7 115.6
  96.6 120.9 112.  102.7  93.   96.3  95.1  97.2 100.4  91.3  99.2 107.9
 114.2 108.   89.5  96.1  99.1  93.3  97.   96.9  95.7  98.4 102.4 102.9
 104.5  97.3 104.3 109.1]

[168.8 171.2 176.6 177.3 192.7 176.8 189.  193.8 197.  141.1 155.9 158.8
 157.3 174.6 173.2 144.6 150.  163.4 157.1 167.5 175.4 169.1 170.7 172.6
 199.6 191.7 159.1 166.8 169.  177.8 175.  190.9 187.5 202.6 180.3 208.1
 199.2 178.4 173.  172.4 165.3 170.2 165.6 162.4 173.4 181.7 184.6 178.5
 186.7 198.9 167.3 168.9 181.5 186.6 156.9 157.9 172.  173.5 173.6 158.7
 169.7 166.3 168.7 176.2 175.6 183.5 187.8 171.7 159.3 165.7 180.2 183.1
 188.8]

[64.1 65.5 66.2 66.4 66.3 71.4 64.8 66.9 67.9 70.9 60.3 63.6 63.8 64.6
 63.9 64.  65.2 62.5 66.  61.8 69.6 70.6 64.2 65.7 66.5 66.1 70.3 71.7
 70.5 72.  68.  64.4 65.4 68.4 68.3 65.  66.6 63.4 65.6 67.7 67.2 68.9
 68.8]

[48.8 52.4 54.3 53.1 55.7 55.9 53.7 56.3 53.2 52.  50.8 50.6 59.8 50.2
 52.6 54.5 58.3 53.3 54.1 51.  53.5 51.4 52.8 47.8 49.6 55.5 54.4 56.5
 58.7 54.9 56.7 55.4 54.8 49.4 51.6 54.7 55.1 56.1 49.7 56.  55.2 50.5
 52.5 53.  59.1 53.9 55.6 56.2 57.5]

[2548 2823 2337 2824 2507 2844 2954 3086 2395 2710 2765 3055 3230 3380
 3505 1488 1874 1909 1876 2128 1967 1989 2191 2535 2811 1713 1819 1837
 1940 1956 2010 2024 2236 2289 2304 2372 2465 2293 2734 4066 3950 1890
 1900 1905 1945 1950 2380 2385 2500 2410 2443 2425 2670 2700 3515 3750
 3495 3770 3740 3685 3900 3715 2910 1918 1944 2004 2145 2370 2328 2833
 2921 2926 2365 2405 2403 1889 2017 1938 1951 2028 1971 2037 2008 2324
 2302 3095 3296 3060 3071 3139 3020 3197 3430 3075 3252 3285 3485 3130
 2818 2778 2756 2800 2579 2460 2658 2695 2707 2758 2808 2847 2050 2120
 2240 2190 2340 2510 2290 2455 2420 2650 1985 2040 2015 2280 3110 2081
 2109 2275 2094 2122 2140 2169 2204 2265 2300 2540 2536 2551 2679 2714
 2975 2326 2480 2414 2458 2976 3016 3131 3151 2261 2209 2264 2212 2319
 2254 2221 2661 2563 2912 3034 2935 3042 3045 3157 2952 3049 3012 3217
 3062]

['dohc' 'ohcv' 'ohc' 'l' 'rotor' 'ohcf']

['four' 'six' 'five' 'three' 'twelve' 'two' 'eight']

[130 152 109 136 131 108 164 209  61  90  98 122 156  92  79 110 111 119
 258 326  91  70  80 140 134 183 234 308 304  97 103 120 181 151 194 132
 121 146 171 161 141 173 145]

['mpfi' '2bbl' 'mfi' '1bbl' 'spfi' '4bbl' 'idi' 'spdi']

[3.47 2.68 3.19 3.13 3.5  3.31 3.62 2.91 3.03 2.97 3.34 3.6  2.92 3.15
 3.43 3.63 3.54 3.08 3.28 3.39 3.76 3.58 3.46 3.8  3.78 3.17 3.35 3.59
 2.99 3.33 3.7  3.61 3.94 3.74 2.54 3.05 3.27 3.24 3.01]

[2.68 3.47 3.4  2.8  3.19 3.39 3.03 3.11 3.23 3.46 3.9  3.41 3.07 3.58
 4.17 2.76 3.15 3.5  3.16 3.64 3.1  3.35 3.12 3.86 3.29 3.27 3.52 2.19
 3.21 2.9  2.07 2.36 2.64 3.08 3.54 2.87]

[ 9.   10.    8.    8.5   8.3   8.8   9.5   9.6   9.41  9.4   7.6   7.
  9.2  10.1   9.1   8.1  11.5   8.6  22.7  22.   21.5   7.5  21.9   7.8
  8.4  21.    8.7   9.31  9.3   7.7  22.5  23.  ]

[111 154 102 115 110 140 101 121 182  48  70  68  88 145  58  76  60  86
 100  78  90 176 262 135  84  64 120  72 123 155 184 175 116  69  55  97
 152 160 200  95 142 143 207  73  82  94  62  56 112  92 161 156  52  85
 114 162 134 106]

[5000 5500 5800 4250 5400 5100 4800 6000 4750 4650 4200 4350 4500 5200
 4150 5600 5900 5250 4900 4400 6600 5300]

[21 19 24 18 17 23 20 16 15 47 38 37 31 49 30 27 25 13 26 36 22 14 45 28
 32 35 34 29 33]

[27 26 30 22 25 20 29 28 53 43 41 38 24 54 42 34 33 31 19 17 23 32 39 18
 16 37 50 36 47 46]

[13495 16500 13950 17450 15250 17710 18920 23875 16430 16925 20970 21105
 24565 30760 41315 36880  5151  6295  6575  5572  6377  7957  6229  6692
  7609  8558  8921 12964  6479  6855  5399  6529  7129  7295  7895  9095
  8845 10295 12945 10345  6785 11048 32250 35550 36000  5195  6095  6795
  6695  7395 10945 11845 13645 15645  8495 10595 10245 10795 11245 18280
 18344 25552 28248 28176 31600 34184 35056 40960 45400 16503  5389  6189
  6669  7689  9959  8499 12629 14869 14489  6989  8189  9279  5499  7099
  6649  6849  7349  7299  7799  7499  7999  8249  8949  9549 13499 14399
 17199 19699 18399 11900 13200 12440 13860 15580 16900 16695 17075 16630
 17950 18150 12764 22018 32528 34028 37028  9295  9895 11850 12170 15040
 15510 18620  5118  7053  7603  7126  7775  9960  9233 11259  7463 10198
  8013 11694  5348  6338  6488  6918  7898  8778  6938  7198  7788  7738
  8358  9258  8058  8238  9298  9538  8449  9639  9989 11199 11549 17669
  8948 10698  9988 10898 11248 16558 15998 15690 15750  7975  7995  8195
  9495  9995 11595  9980 13295 13845 12290 12940 13415 15985 16515 18420
 18950 16845 19045 21485 22470 22625]

In [7]:
for string in automobile:
    if automobile[string].dtypes=="object":
        print(automobile[string].value_counts())
        print()
toyota           32
nissan           18
mazda            17
mitsubishi       13
honda            13
subaru           12
volkswagen       12
peugot           11
volvo            11
dodge             9
bmw               8
mercedes-benz     8
plymouth          7
audi              6
saab              6
porsche           4
chevrolet         3
jaguar            3
alfa-romero       3
renault           2
isuzu             2
mercury           1
Name: make, dtype: int64

gas       181
diesel     20
Name: fuel_type, dtype: int64

std      165
turbo     36
Name: aspiration, dtype: int64

four    114
two      87
Name: number_of_doors, dtype: int64

sedan          94
hatchback      68
wagon          25
hardtop         8
convertible     6
Name: body_style, dtype: int64

fwd    118
rwd     75
4wd      8
Name: drive_wheels, dtype: int64

front    198
rear       3
Name: engine_location, dtype: int64

ohc      145
ohcf      15
ohcv      13
l         12
dohc      12
rotor      4
Name: engine_type, dtype: int64

four      157
six        24
five       10
two         4
eight       4
three       1
twelve      1
Name: number_of_cylinders, dtype: int64

mpfi    92
2bbl    64
idi     20
1bbl    11
spdi     9
4bbl     3
mfi      1
spfi     1
Name: fuel_system, dtype: int64

In [8]:
sns.pairplot(automobile)
Out[8]:
<seaborn.axisgrid.PairGrid at 0x2678d0d0f70>
In [9]:
plt.figure(figsize=(15,8))
sns.heatmap(automobile.corr(),annot=True)
Out[9]:
<AxesSubplot:>
In [10]:
import plotly.express as px
px.scatter(automobile,x="city_mpg",y="peak_rpm")
In [11]:
px.scatter(automobile,x="width",y="height")
In [12]:
sns.displot(automobile["price"])
Out[12]:
<seaborn.axisgrid.FacetGrid at 0x2679b606b80>
In [13]:
plt.figure(figsize=(15,5))
sns.barplot(data=automobile,x="number_of_cylinders",y="price")
Out[13]:
<AxesSubplot:xlabel='number_of_cylinders', ylabel='price'>
In [14]:
pd.crosstab(automobile["number_of_cylinders"],automobile["make"],margins=True).T
Out[14]:
number_of_cylinders eight five four six three twelve two All
make
alfa-romero 0 0 2 1 0 0 0 3
audi 0 5 1 0 0 0 0 6
bmw 0 0 2 6 0 0 0 8
chevrolet 0 0 2 0 1 0 0 3
dodge 0 0 9 0 0 0 0 9
honda 0 0 13 0 0 0 0 13
isuzu 0 0 2 0 0 0 0 2
jaguar 0 0 0 2 0 1 0 3
mazda 0 0 13 0 0 0 4 17
mercedes-benz 4 4 0 0 0 0 0 8
mercury 0 0 1 0 0 0 0 1
mitsubishi 0 0 13 0 0 0 0 13
nissan 0 0 12 6 0 0 0 18
peugot 0 0 11 0 0 0 0 11
plymouth 0 0 7 0 0 0 0 7
porsche 0 0 1 3 0 0 0 4
renault 0 0 2 0 0 0 0 2
saab 0 0 6 0 0 0 0 6
subaru 0 0 12 0 0 0 0 12
toyota 0 0 28 4 0 0 0 32
volkswagen 0 1 11 0 0 0 0 12
volvo 0 0 9 2 0 0 0 11
All 4 10 157 24 1 1 4 201
In [15]:
automobile["normalized_losses"].max(),automobile["normalized_losses"].min()
Out[15]:
(256, 65)
In [16]:
loss=pd.cut(automobile["normalized_losses"],[60,90,120,150,180,210,240,270])
automobile.pivot_table(values=["price","engine_size","compression_ratio","stroke","bore"],index=["make",loss],margins=True)
Out[16]:
bore compression_ratio engine_size price stroke
make normalized_losses
alfa-romero (150, 180] 3.206667 9.000000 137.333333 15498.333333 2.943333
audi (150, 180] 3.180000 8.633333 130.666667 17859.166667 3.400000
bmw (120, 150] 3.542500 8.250000 197.750000 33380.000000 3.340000
(180, 210] 3.405000 8.900000 136.000000 18857.500000 2.995000
chevrolet (60, 90] 3.030000 9.600000 90.000000 6575.000000 3.110000
(90, 120] 3.030000 9.600000 90.000000 6295.000000 3.110000
(120, 150] 2.910000 9.500000 61.000000 5151.000000 3.030000
dodge (90, 120] 3.077500 8.727500 100.000000 7206.750000 3.327500
(120, 150] 3.108000 8.560000 104.800000 8410.400000 3.396000
honda (60, 90] 3.092500 9.050000 105.500000 9845.000000 3.537500
(90, 120] 3.012857 9.257143 97.857143 7669.571429 3.434286
(120, 150] 2.910000 9.400000 92.000000 6667.000000 3.410000
isuzu (90, 120] 3.370000 8.850000 115.000000 8916.500000 3.230000
jaguar (90, 120] 3.585000 9.800000 292.000000 35775.000000 3.465000
(120, 150] 3.630000 8.100000 258.000000 32250.000000 4.170000
mazda (90, 120] 3.268182 11.227273 110.636364 9961.727273 3.282727
(120, 150] 3.316667 9.133333 89.000000 11920.000000 3.463333
mercedes-benz (90, 120] 3.580000 21.500000 183.000000 28394.000000 3.640000
(120, 150] 3.630000 8.150000 270.000000 38900.000000 3.225000
mercury (120, 150] 3.780000 8.000000 140.000000 16503.000000 3.120000
mitsubishi (120, 150] 3.400000 7.571429 133.142857 10817.571429 3.631429
(150, 180] 3.076667 8.633333 101.000000 7399.000000 3.333333
nissan (90, 120] 3.303333 8.966667 132.666667 10290.666667 3.343333
(120, 150] 3.165000 10.912500 108.250000 7774.000000 3.310000
(150, 180] 3.150000 9.400000 97.000000 8249.000000 3.290000
(180, 210] 3.430000 8.400000 181.000000 18449.000000 3.270000
(210, 240] 3.430000 9.000000 181.000000 18399.000000 3.270000
peugot (150, 180] 3.582727 14.000000 135.818182 15489.090909 3.160000
plymouth (60, 90] 3.350000 8.500000 122.000000 8921.000000 3.460000
(90, 120] 3.000000 8.500000 94.000000 6764.500000 3.310000
(120, 150] 3.590000 7.000000 156.000000 12764.000000 3.860000
(150, 180] 2.970000 9.400000 92.666667 6843.333333 3.230000
porsche (120, 150] 3.740000 9.500000 194.000000 34528.000000 2.900000
(180, 210] 3.940000 9.500000 151.000000 22018.000000 3.110000
renault (120, 150] 3.460000 8.700000 132.000000 9595.000000 3.900000
saab (90, 120] 3.540000 9.200000 121.000000 15433.333333 3.070000
(120, 150] 3.206667 9.203333 121.000000 15013.333333 2.736667
subaru (60, 90] 3.620000 8.728571 106.428571 8163.142857 2.600000
(90, 120] 3.620000 8.940000 108.000000 9070.600000 2.640000
toyota (60, 90] 3.184545 10.136364 111.727273 9132.727273 3.273636
(90, 120] 3.192500 12.375000 100.250000 7994.250000 3.110000
(120, 150] 3.570000 9.285714 148.142857 12034.857143 3.478571
(150, 180] 3.215000 9.200000 98.000000 8783.000000 3.055000
(180, 210] 3.270000 9.300000 171.000000 16278.000000 3.350000
volkswagen (90, 120] 3.118000 14.800000 104.200000 8835.000000 3.400000
(120, 150] 3.130000 13.500000 109.500000 11129.166667 3.400000
(240, 270] 3.190000 8.500000 109.000000 9980.000000 3.400000
volvo (60, 90] 3.726667 8.833333 137.333333 16293.333333 3.150000
(90, 120] 3.638750 10.750000 144.125000 18726.875000 3.146250
All 3.329701 10.164279 126.875622 13207.129353 3.261741
In [17]:
automobile.pivot_table(values=["price"],index=["make"],columns="fuel_type",aggfunc=sum).plot()
plt.show()
In [18]:
round(automobile.groupby("make")["price","normalized_losses","length","width","height",
                                 "curb_weight","horsepower","peak_rpm","city_mpg","highway_mpg"].mean())
Out[18]:
price normalized_losses length width height curb_weight horsepower peak_rpm city_mpg highway_mpg
make
alfa-romero 15498.0 168.0 170.0 65.0 50.0 2640.0 125.0 5000.0 20.0 27.0
audi 17859.0 162.0 185.0 69.0 55.0 2759.0 114.0 5500.0 19.0 24.0
bmw 26119.0 170.0 184.0 66.0 55.0 2929.0 139.0 5069.0 19.0 25.0
chevrolet 6007.0 100.0 152.0 62.0 52.0 1757.0 63.0 5300.0 41.0 46.0
dodge 7875.0 133.0 161.0 64.0 52.0 2151.0 86.0 5389.0 28.0 34.0
honda 8185.0 103.0 161.0 64.0 53.0 2097.0 80.0 5754.0 30.0 35.0
isuzu 8916.0 110.0 172.0 64.0 52.0 2536.0 84.0 4900.0 24.0 29.0
jaguar 34600.0 125.0 197.0 70.0 51.0 4027.0 205.0 4833.0 14.0 18.0
mazda 10653.0 123.0 171.0 66.0 53.0 2298.0 86.0 5109.0 26.0 32.0
mercedes-benz 33647.0 114.0 195.0 71.0 56.0 3696.0 146.0 4488.0 18.0 21.0
mercury 16503.0 140.0 178.0 68.0 55.0 2910.0 175.0 5000.0 19.0 24.0
mitsubishi 9240.0 145.0 168.0 65.0 51.0 2382.0 104.0 5269.0 25.0 31.0
nissan 10416.0 135.0 171.0 65.0 54.0 2400.0 103.0 5178.0 27.0 33.0
peugot 15489.0 161.0 191.0 68.0 57.0 3221.0 100.0 4668.0 22.0 27.0
plymouth 7963.0 131.0 165.0 64.0 52.0 2221.0 87.0 5357.0 28.0 34.0
porsche 31400.0 142.0 169.0 66.0 51.0 2772.0 191.0 5800.0 18.0 26.0
renault 9595.0 129.0 179.0 67.0 53.0 2520.0 90.0 5500.0 23.0 31.0
saab 15223.0 127.0 187.0 66.0 56.0 2746.0 127.0 5333.0 20.0 27.0
subaru 8541.0 92.0 169.0 65.0 54.0 2316.0 86.0 4775.0 26.0 31.0
toyota 9886.0 111.0 172.0 65.0 54.0 2441.0 93.0 4859.0 28.0 33.0
volkswagen 10078.0 125.0 173.0 66.0 55.0 2343.0 81.0 5154.0 29.0 35.0
volvo 18063.0 91.0 189.0 68.0 56.0 3038.0 128.0 5291.0 21.0 26.0
In [19]:
# automobile.dtypes
for decimal in automobile:
    if automobile[decimal].dtypes=="int64":
        automobile[decimal]=automobile[decimal].astype(float)
        print(automobile[decimal].dtypes)
float64
float64
float64
float64
float64
float64
float64
float64
float64
In [20]:
automobile=automobile.select_dtypes(exclude=["object"])
x=automobile.drop("price",axis=1)
y=automobile["price"]
print(x,y)
     symboling  normalized_losses  wheel_base  length  width  height  \
0          3.0              168.0        88.6   168.8   64.1    48.8   
1          3.0              168.0        88.6   168.8   64.1    48.8   
2          1.0              168.0        94.5   171.2   65.5    52.4   
3          2.0              164.0        99.8   176.6   66.2    54.3   
4          2.0              164.0        99.4   176.6   66.4    54.3   
..         ...                ...         ...     ...    ...     ...   
196       -1.0               95.0       109.1   188.8   68.9    55.5   
197       -1.0               95.0       109.1   188.8   68.8    55.5   
198       -1.0               95.0       109.1   188.8   68.9    55.5   
199       -1.0               95.0       109.1   188.8   68.9    55.5   
200       -1.0               95.0       109.1   188.8   68.9    55.5   

     curb_weight  engine_size  bore  stroke  compression_ratio  horsepower  \
0         2548.0        130.0  3.47    2.68                9.0       111.0   
1         2548.0        130.0  3.47    2.68                9.0       111.0   
2         2823.0        152.0  2.68    3.47                9.0       154.0   
3         2337.0        109.0  3.19    3.40               10.0       102.0   
4         2824.0        136.0  3.19    3.40                8.0       115.0   
..           ...          ...   ...     ...                ...         ...   
196       2952.0        141.0  3.78    3.15                9.5       114.0   
197       3049.0        141.0  3.78    3.15                8.7       160.0   
198       3012.0        173.0  3.58    2.87                8.8       134.0   
199       3217.0        145.0  3.01    3.40               23.0       106.0   
200       3062.0        141.0  3.78    3.15                9.5       114.0   

     peak_rpm  city_mpg  highway_mpg  
0      5000.0      21.0         27.0  
1      5000.0      21.0         27.0  
2      5000.0      19.0         26.0  
3      5500.0      24.0         30.0  
4      5500.0      18.0         22.0  
..        ...       ...          ...  
196    5400.0      23.0         28.0  
197    5300.0      19.0         25.0  
198    5500.0      18.0         23.0  
199    4800.0      26.0         27.0  
200    5400.0      19.0         25.0  

[201 rows x 15 columns] 0      13495.0
1      16500.0
2      16500.0
3      13950.0
4      17450.0
        ...   
196    16845.0
197    19045.0
198    21485.0
199    22470.0
200    22625.0
Name: price, Length: 201, dtype: float64
In [21]:
from sklearn.model_selection import train_test_split
xtrain,xtest,ytrain,ytest=train_test_split(x,y,test_size=.25,random_state=3)
In [22]:
print(xtrain.head())
     symboling  normalized_losses  wheel_base  length  width  height  \
103        1.0              231.0        99.2   178.5   67.9    49.7   
89         1.0              122.0        94.5   165.3   63.8    54.5   
133        2.0              104.0        99.1   186.6   66.5    56.1   
109        0.0              161.0       107.9   186.7   68.4    56.7   
170       -1.0               65.0       102.4   175.6   66.5    54.9   

     curb_weight  engine_size  bore  stroke  compression_ratio  horsepower  \
103       3139.0        181.0  3.43    3.27                9.0       160.0   
89        1938.0         97.0  3.15    3.29                9.4        69.0   
133       2847.0        121.0  3.54    3.07                9.0       160.0   
109       3252.0        152.0  3.70    3.52               21.0        95.0   
170       2480.0        110.0  3.27    3.35               22.5        73.0   

     peak_rpm  city_mpg  highway_mpg  
103    5200.0      19.0         25.0  
89     5200.0      31.0         37.0  
133    5500.0      19.0         26.0  
109    4150.0      28.0         33.0  
170    4500.0      30.0         33.0  
In [23]:
print(xtest.head())
     symboling  normalized_losses  wheel_base  length  width  height  \
40         0.0               85.0        96.5   175.4   65.2    54.1   
51         1.0              113.0        93.1   166.8   64.2    54.1   
140        0.0              102.0        97.0   172.0   65.4    54.3   
132        3.0              150.0        99.1   186.6   66.5    56.1   
171       -1.0               65.0       102.4   175.6   66.5    53.9   

     curb_weight  engine_size  bore  stroke  compression_ratio  horsepower  \
40        2465.0        110.0  3.15    3.58                9.0       101.0   
51        1950.0         91.0  3.08    3.15                9.0        68.0   
140       2385.0        108.0  3.62    2.64                9.0        82.0   
132       2808.0        121.0  3.54    3.07                9.0       160.0   
171       2414.0        122.0  3.31    3.54                8.7        92.0   

     peak_rpm  city_mpg  highway_mpg  
40     5800.0      24.0         28.0  
51     5000.0      31.0         38.0  
140    4800.0      24.0         25.0  
132    5500.0      19.0         26.0  
171    4200.0      27.0         32.0  
In [24]:
from sklearn.tree import DecisionTreeRegressor
dtr=DecisionTreeRegressor()
dtr.fit(xtrain,ytrain)
Out[24]:
DecisionTreeRegressor()
In [25]:
ypred=dtr.predict(xtest)
ypred
Out[25]:
array([ 8013.,  6695., 11259., 18620., 11248.,  8845.,  6938., 45400.,
       45400., 13499., 17669.,  9370.,  6692.,  7898.,  9298., 15998.,
        7499.,  8948.,  9995.,  8845., 36880., 33278., 20970., 16845.,
        8013.,  9298., 15750.,  8449., 18399.,  6649., 28176.,  6938.,
        9258.,  7099.,  9095.,  9980., 11900., 18420., 28248.,  5389.,
       12170.,  5389.,  6575., 36880.,  9370., 12170., 12964., 18920.,
        6229.,  7788.,  6295.])
In [26]:
pd.DataFrame({"Actual":ytest,"Predicted":ypred})
Out[26]:
Actual Predicted
40 12945.0 8013.0
51 7395.0 6695.0
140 9233.0 11259.0
132 18150.0 18620.0
171 9988.0 11248.0
82 6989.0 8845.0
153 7198.0 6938.0
46 36000.0 45400.0
70 40960.0 45400.0
100 13499.0 13499.0
151 8778.0 17669.0
83 8189.0 9370.0
25 7609.0 6692.0
139 9960.0 7898.0
160 8238.0 9298.0
174 16558.0 15998.0
95 8249.0 7499.0
3 13950.0 8948.0
41 10345.0 9995.0
58 10595.0 8845.0
14 30760.0 36880.0
125 37028.0 33278.0
12 21105.0 20970.0
6 17710.0 16845.0
143 10198.0 8013.0
162 9538.0 9298.0
176 15690.0 15750.0
164 9639.0 8449.0
101 17199.0 18399.0
86 5499.0 6649.0
64 25552.0 28176.0
47 5195.0 6938.0
159 8058.0 9258.0
34 7295.0 7099.0
38 8845.0 9095.0
185 11595.0 9980.0
4 17450.0 11900.0
72 16503.0 18420.0
67 31600.0 28248.0
146 5348.0 5389.0
128 11850.0 12170.0
115 5572.0 5389.0
156 7738.0 6575.0
15 41315.0 36880.0
61 11245.0 9370.0
130 15040.0 12170.0
121 12764.0 12964.0
198 21485.0 18920.0
23 6229.0 6229.0
154 7898.0 7788.0
31 5399.0 6295.0
In [27]:
from sklearn import metrics
print("mean absolute error:",round(metrics.mean_absolute_error(ytest,ypred)))
print("mean squared error:",round(metrics.mean_squared_error(ytest,ypred)))
print("root mean squared error:", round(np.sqrt(metrics.mean_squared_error(ytest,ypred))))
mean absolute error: 1919
mean squared error: 8225198
root mean squared error: 2868
In [28]:
plt.hist(y)
Out[28]:
(array([81., 45., 35., 17.,  6.,  3.,  4.,  7.,  2.,  1.]),
 array([ 5118. ,  9146.2, 13174.4, 17202.6, 21230.8, 25259. , 29287.2,
        33315.4, 37343.6, 41371.8, 45400. ]),
 <BarContainer object of 10 artists>)
In [29]:
plt.figure(figsize=(15,5))
sns.distplot(y,hist=False,label="Actual",color="r")
sns.distplot(ypred,hist=False,label="Predicted",color="b")
plt.show()
# plt.close()
In [35]:
from sklearn import tree
plt.figure(figsize=(15,8))
tree.plot_tree(dtr,filled=True,rounded=True,fontsize=7)
plt.show()
In [36]:
from sklearn.ensemble import RandomForestRegressor
In [37]:
rfr=RandomForestRegressor(n_estimators=15,random_state=4)
In [38]:
rfr.fit(xtrain,ytrain)
Out[38]:
RandomForestRegressor(n_estimators=15, random_state=4)
In [39]:
y_pred=rfr.predict(xtest)
y_pred
Out[39]:
array([11063.73333333,  6700.8       ,  9798.73333333, 18254.73333333,
       10915.93333333,  8735.08888889,  7740.8       , 37940.        ,
       40052.93333333, 15662.93333333, 17682.46666667,  8676.88888889,
        6604.66666667,  8946.53333333,  8435.2       , 17125.86666667,
        7495.66666667,  9705.96666667,  9263.33333333,  8860.82222222,
       33571.26666667, 31249.93333333, 19366.86666667, 19862.53333333,
        9339.        , 10020.16666667, 17016.8       ,  8964.73333333,
       19401.        ,  6885.66666667, 28512.73333333,  6482.86666667,
        8531.4       ,  7166.        ,  9582.66666667,  9613.4       ,
       16411.13333333, 17341.46666667, 32338.86666667,  6567.73333333,
       13807.16666667,  6120.46666667,  7308.66666667, 33196.93333333,
        9256.94444444, 13772.33333333, 13545.66666667, 20355.6       ,
        6453.6       ,  8038.26666667,  5872.06666667])
In [40]:
pd.DataFrame({"Actual": ytest,"Prediction":y_pred})
Out[40]:
Actual Prediction
40 12945.0 11063.733333
51 7395.0 6700.800000
140 9233.0 9798.733333
132 18150.0 18254.733333
171 9988.0 10915.933333
82 6989.0 8735.088889
153 7198.0 7740.800000
46 36000.0 37940.000000
70 40960.0 40052.933333
100 13499.0 15662.933333
151 8778.0 17682.466667
83 8189.0 8676.888889
25 7609.0 6604.666667
139 9960.0 8946.533333
160 8238.0 8435.200000
174 16558.0 17125.866667
95 8249.0 7495.666667
3 13950.0 9705.966667
41 10345.0 9263.333333
58 10595.0 8860.822222
14 30760.0 33571.266667
125 37028.0 31249.933333
12 21105.0 19366.866667
6 17710.0 19862.533333
143 10198.0 9339.000000
162 9538.0 10020.166667
176 15690.0 17016.800000
164 9639.0 8964.733333
101 17199.0 19401.000000
86 5499.0 6885.666667
64 25552.0 28512.733333
47 5195.0 6482.866667
159 8058.0 8531.400000
34 7295.0 7166.000000
38 8845.0 9582.666667
185 11595.0 9613.400000
4 17450.0 16411.133333
72 16503.0 17341.466667
67 31600.0 32338.866667
146 5348.0 6567.733333
128 11850.0 13807.166667
115 5572.0 6120.466667
156 7738.0 7308.666667
15 41315.0 33196.933333
61 11245.0 9256.944444
130 15040.0 13772.333333
121 12764.0 13545.666667
198 21485.0 20355.600000
23 6229.0 6453.600000
154 7898.0 8038.266667
31 5399.0 5872.066667
In [49]:
print("MSE :", round(metrics.mean_squared_error(ytest,y_pred)))
print("MAE:", round(metrics.mean_absolute_error(ytest,ypred)))
print("SRMSE :",round( np.sqrt(metrics.mean_squared_error(ytest,ypred))))
MSE : 5457224
MAE: 1919
SRMSE : 2868
In [60]:
plt.figure(figsize=(15,5))
sns.distplot(y,hist=False,color="red")
sns.distplot(y_pred,hist=False,color="blue")
plt.show()
In [ ]:
 
In [ ]:
 
In [ ]:
 
In [ ]: